# The Distributional Consequences of Technological Change: Worker-Level Evidence
# Thomas Kurer and Aina Gallego

# Analysis, Tables, Figure (exc. Fig 4)

#### Header
rm(list=ls())

# if you are missing any of the following libraries, uncomment the next line and execute the command
# install.packages(c("readstata13", "tidyverse", "ggplot2", "zoo", "ggalluvial", "xtable", "scales", "reshape2"), dependencies=T)
library(readstata13)
library(tidyverse)
library(ggplot2)
library(zoo)
library(ggalluvial)
library(xtable)
library(scales)
library(reshape2)

# set your working directory
setwd("INSERTYOURPATH")

####################################################################################################
#### Data

tech <- read.dta13("workerLevelEvidence.dta", convert.factors=F)

# output path for saving plots
path <- "INSERTYOUROUTPUTPATH/"

####################################################################################################
#### Output

desc <- tech %>% 
  filter(!is.na(task3)) %>% 
  group_by(year) %>% 
  mutate(one=1, total=sum(one)) %>% 
  group_by(task3, year) %>% 
  mutate(nr=sum(one), share=nr/total, meanincreal=mean(incomem, na.rm=T), meanage=mean(age, na.rm=T))

desc$task3 <- as.factor(desc$task3)
levels(desc$task3) <- c("Non-Routine Cognitive", "Routine", "Non-Routine Manual")

# Figure 1 ----

share <- ggplot(desc, aes(x=year, y=share, group=factor(task3), color=factor(task3))) + 
  geom_line(size=1) + 
  ylim(0.2, .45) + ylab("Relative Share of Labor Force") + xlab("Year") +
  theme_bw() +
  theme(text = element_text(size=20), legend.title=element_blank())
share
ggsave(paste(path, "share_revision.eps", sep=""), width = 10)

# Figure 2 ----

# Panel a
increal <- ggplot(desc, aes(x=year, y=meanincreal, group=factor(task3), color=factor(task3))) + 
  geom_line(size=1) + 
  ylim(250, 2000) + ylab("") + xlab("Year") +
  theme_bw() +
  theme(text = element_text(size=20),legend.title=element_blank())
increal
ggsave(paste(path, "increal_revision.eps", sep=""), width=10)

# Panel b
age <- ggplot(desc, aes(x=year, y=meanage, group=factor(task3), color=factor(task3))) + 
  geom_line(size=1) + 
  ylim(35, 45) + ylab("") + xlab("Year") +
  theme_bw() +
  theme(text = element_text(size=20),legend.title=element_blank())
age
ggsave(paste(path, "age_revision.eps", sep=""), width=10)


# Preparation Figure 3 ----
# unemployment often missing when occupation (--> task) missing
# need to carry forward missing task values


unemp <- tech %>%
  arrange(pidp, year) %>% group_by(pidp) %>% select(pidp, year, task3, unemp, age) 

unemp <- unemp %>%
  group_by(pidp) %>% mutate(task3imp=zoo::na.locf(task3, na.rm = FALSE))

unemp <- unemp %>%
  ungroup()

trans <- unemp %>%
  group_by(pidp) %>%
  mutate(firsttask = task3[which(!is.na(task3))[1]], lasttask = task3imp[year==max(year)], lastunemp=unemp[year==max(year)], firstyear = year[which(!is.na(task3))[1]], lastyear=year[year==max(year)])

trans$last <- ifelse(trans$lastunemp==1, 4, trans$lasttask)
trans$lastbroad <- ifelse(trans$age>64, 5, trans$last)
trans$spell <- trans$lastyear-trans$firstyear

translast <- trans %>% group_by(pidp) %>% filter(year == max(year))

transitionsb <- prop.table(table(translast$firsttask, translast$lastbroad),1)

perc <- transitionsb[2,]
# Figure 3 ----

riverb <- data.frame( Start=c ( rep("1 nrc", 5), rep("2 r", 5), rep("3 nrm", 5) ), End = rep(c("1 nrc", "2 r", "3 nrm", "4 u", "5 ret"), 3) , freq=c(transitionsb[1,], transitionsb[2,], transitionsb[3,]) )

levels(riverb$Start) <- c("Non-R Cog", "Routine", "Non-R Man")
levels(riverb$End) <- c("Non-R Cog", "Routine", "Non-R Man", "Unemp", "Retired")


A_col <- "grey"
B_col <- "darkorange1"
C_col <- "grey"
D_col <- "grey"
E_col <- "grey"

alpha <- 0.7 # transparency value
fct_levels <- c("A","C","B")


riverplotb <- ggplot(riverb,
                    aes(weight = freq, axis1 = Start, axis2 = End)) +
  geom_alluvium(aes(fill = Start, color = Start), 
                width = 1/12, alpha = alpha, knot.pos = 0.4) +
  geom_stratum(width = 1/5,color = "black") +
  scale_fill_manual(values  = c(A_col, B_col, C_col, D_col, E_col)) +
  scale_color_manual(values = c(A_col, B_col, C_col, D_col, E_col)) +
  geom_text(stat = "stratum", label.strata = TRUE, size=4) +
  scale_x_continuous(breaks = 1:2, labels = c("", "")) +
  geom_text(x=1.8, y=2.1, label=paste0(round((perc[1]*100),1),"%")) +
  geom_text(x=1.8, y=1.6, label=paste0(round((perc[2]*100),1),"%")) +
  geom_text(x=1.8, y=1.1, label=paste0(round((perc[3]*100),1),"%")) +
  geom_text(x=1.8, y=0.3, label=paste0(round((perc[4]*100),1),"%")) +
  geom_text(x=1.8, y=0.1, label=paste0(round((perc[5]*100),1),"%")) +
  theme_minimal() +
  theme(
    legend.position = "none",
    panel.grid.major = element_blank(), 
    panel.grid.minor = element_blank(),
    axis.text.y = element_blank(),
    axis.text.x = element_text(size = 18, face = "bold")
  )

riverplotb
ggsave(paste(path, "riverplot_perc.eps", sep=""), width=8, height=7, device=cairo_ps)

# vary time windows

rout <- trans %>% filter(firsttask==2)

rout$count <- rout$year - rout$firstyear
rout$count[rout$count<0] <- NA #some cases, because first year = first year with !is.na(task)


leads <- seq(22)
lead_names <- paste("lead", formatC(leads, width = nchar(max(leads)), flag = "0"), 
                    sep = "_")
lead_functions <- setNames(paste("dplyr::lead(., ", leads, ")"), lead_names)

rout <- rout %>% mutate_at(vars(task3, unemp, age), funs_(lead_functions))

## Gen lead indices

indices <- c(paste0('0', 1:9), 10:22)

## run Loop

for (index in indices) {
  
  ## Make the variable
  
  hanno_temp <- rout[, paste0('task3_lead_', index)]
  hanno_temp[rout[, paste0('unemp_lead_', index)] == 1] <- 4
  hanno_temp[rout[, paste0('age_lead_', index)] > 64] <- 5
  
  ## Add to DF
  
  rout[, paste0('taskbroad_lead_', index)] <- hanno_temp
}

transitions <- rout %>%
  group_by(pidp) %>%
  mutate(firstyear = year[which(!is.na(task3))[1]], lastyear=max(year)) %>%
  filter(year==lastyear) # firstyear not suitable because selection of task based on year 1

transitions_leads <- matrix(ncol=6, nrow=21)

coltask <- which(colnames(transitions)=="task3")
collead1 <- which(colnames(transitions)=="taskbroad_lead_01")-1

for (i in 1:21) {
  for (j in 1:5) {
    transitions_leads[i,j] <- prop.table(table(rout[[coltask]], rout[[collead1+i]]),1)[2,j]
    transitions_leads[i,6] <- addmargins(table(rout$task3, rout[[collead1+i]]))[2,6]
  }
}

transitions_leads <- as.data.frame(transitions_leads)
spell <- data.frame(spell=c(1:21))
transitions_leads <- cbind(spell, transitions_leads)
transitions_leads <- transitions_leads %>% dplyr::rename(Upgrader=V1, Survivor=V2, Downgrader=V3, Unemployed=V4, Retired=V5, N=V6)

transitions_leads_long <- melt(transitions_leads, id.vars = c("spell", "N"))

# find xintercept
xval <- transitions_leads_long %>% filter(variable=="Survivor", value>.6&value<.66) %>% select(spell, value)
vals <- (value=c(xval[2,2], rep(NA,9), xval[1,2]))
xcont <- (spell=seq(4,5,0.1))
xintercept <- as.data.frame(cbind(xcont,vals))
rowxintercept <- which.min(abs(approxfun(1:11, xintercept$vals)(1:11) - transitionsb[2,2])) 
xintercept <- xintercept[rowxintercept,1]

transitions_leads_long$variablesort <- factor(transitions_leads_long$variable, levels(transitions_leads_long$variable)[c(5,2,1,3,4)])

ggplot(transitions_leads_long) + 
  geom_line(aes(x=spell, y=value, group=variablesort, color=variablesort), size=1) +
  geom_vline(xintercept=xintercept, linetype="dotted") +
  ylab("Share of Workes (Routine in Year 0)") + 
  xlab("Sample: Time Window (in Years) Between First and Last Observation") +
  labs(color="") +
  theme_bw() +
  theme(text = element_text(size=15),legend.title=element_blank())

ggsave(paste(path, "transitions_overtime.eps", sep=""), width=8, height=6.8)


# Appendix ----


tableA1 <- tech %>% 
  filter(year>1996) %>% 
  group_by(year) %>% 
  summarise(mean=mean(ICT_hours, na.rm=T), sd=sd(ICT_hours, na.rm=T), min=min(ICT_hours, na.rm=T), max=max(ICT_hours, na.rm=T))

# Appendix Table 1 ----
xtable(tableA1)
print(xtable(tableA1, type = "latex", digits=c(0,0,3,3,3,3)), file = paste(path, "tableA1.tex", sep=""))

# Appendix Table 2 ----
xtable(transitionsb, digits=3)
print(xtable(transitionsb, type = "latex", digits=3), file = paste(path, "tableA2.tex", sep=""))

# Appendix Figure 5 ----

# read in with factors for industry labels
df = read.dta13("workerLevelEvidence.dta", nonint.factors = TRUE) 

df2 = df %>%
  filter(year >1997,
         age > 17,
         age < 65,
         !is.na(incomem))

ICT_per_industry_plot  = df2 %>% 
  group_by(year, euklems_num) %>%
  filter(euklems_num != "NA",
         ICT_hours > 0) %>%
  ggplot(aes(x=year,y=ICT_hours, na.rm = TRUE)) +
  geom_line()+ 
  scale_y_log10() +
  xlab("Year") +
  ylab("ICT per hours worked (logged)") +
  facet_wrap(~ as.character(euklems_num), ncol = 5) + 
  theme_bw() +
  theme(strip.text.x = element_text(size = 8, colour = "black"))
ggsave(paste(path, "ICT_per_industry.eps", sep=""))
ggsave(paste(path, "ICT_per_industry.png", sep=""))

# Appendix Figure 6 ----

tech$agegrp <- NA
tech$agegrp[tech$age>14&tech$age<30] <- "a) young 15-29"
tech$agegrp[tech$age>29&tech$age<50] <- "b) mid 30-49"
tech$agegrp[tech$age>49&tech$age<66] <- "c) old 50-65"

tech$decade <- NA
tech$decade[tech$year>=1990&tech$year<=1999] <- "1990s"
tech$decade[tech$year>=2000&tech$year<=2009] <- "2000s"
tech$decade[tech$year>=2010&tech$year<=2019] <- "2010s"

defaultcols <- hue_pal()(3)


agegrpshares <- tech %>% group_by(agegrp, decade, task3) %>% filter(!is.na(task3)&!is.na(agegrp)) %>% tally(task3)
agegrpshares <- agegrpshares %>% group_by(agegrp, decade) %>% mutate(totalyear=sum(n))
agegrpshares$shares <- agegrpshares$n/agegrpshares$totalyear

ggplot(agegrpshares, aes(x=task3, y=shares, group=factor(task3), fill=factor(task3))) + 
  geom_bar(stat = "identity") + 
  facet_wrap(~agegrp+decade) + 
  scale_fill_manual(name ="Task Group", values = c(defaultcols[1], defaultcols[2], defaultcols[3]),labels=c("Non-Routine Manual", "Routine", "Non-Routine-Cognitive")) +
  xlab("") + ylab("Relative Share") +
  theme_bw()
ggsave(paste(path, "shares_agegroup_decade.eps", sep=""))

agegrpshares$delta <- NA
rows <- which(agegrpshares$decade=="2010s")
for (i in rows) {
  agegrpshares$delta[i] <- round(agegrpshares[i,6]-agegrpshares[i-6,6], 3)
}  
agegrpshares$deltav <- as.numeric(agegrpshares$delta)


ggplot(agegrpshares, aes(x=task3, y=deltav, group=factor(task3), fill=factor(task3))) + 
  geom_bar(stat = "identity") + 
  facet_wrap(~agegrp) + 
  scale_fill_manual(name ="Task Group", values = c(defaultcols[1], defaultcols[2], defaultcols[3]),labels=c("Non-Routine Manual", "Routine", "Non-Routine-Cognitive")) +
  ylab("Change in Relative Share 1990 to 2010s") + xlab("") + theme_bw()
ggsave(paste(path, "change_shares_agegroup_decade.eps", sep=""))

